#import libraries
from datetime import datetime, timedelta,date
import pandas as pd
%matplotlib inline
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division
from sklearn.cluster import KMeans
from __future__ import division
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.offline as pyoff
pip install xgboost
Requirement already satisfied: xgboost in ./opt/anaconda3/lib/python3.9/site-packages (1.6.2) Requirement already satisfied: scipy in ./opt/anaconda3/lib/python3.9/site-packages (from xgboost) (1.7.3) Requirement already satisfied: numpy in ./opt/anaconda3/lib/python3.9/site-packages (from xgboost) (1.21.5) Note: you may need to restart the kernel to use updated packages.
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
import xgboost as xgb
# Initialise plotly's offline notebook mode (pyoff is plotly.offline, imported above).
pyoff.init_notebook_mode()
# Load the LTV dataset.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running this notebook elsewhere.
tx_data = pd.read_csv('/Users/hassaniftikhar4472/Downloads/data/LTV.csv')

# tx_user and tx_cluster are further names for the same DataFrame object
# (no copy is made), so the User_ID column added below is also visible
# through tx_data and tx_user.
tx_user = tx_data
tx_cluster = tx_user

# Number the rows 1..n. The stop value has to be n + 1: range(1, n) yields
# only n - 1 values, which left the last row's User_ID as NaN and turned the
# whole column into floats. Assigning the range directly is positional, so
# the result does not depend on index alignment.
tx_cluster['User_ID'] = range(1, tx_cluster.shape[0] + 1)

# One-hot encode the Segment column into Segment_* indicator columns.
one_hot_encoded_data = pd.get_dummies(tx_cluster, columns=['Segment'])
tx_class = one_hot_encoded_data

# Rank every column by its correlation with the LTVCluster target
# (the last expression is displayed as the cell output).
corr_matrix = tx_class.corr()
corr_matrix['LTVCluster'].sort_values(ascending=False)
LTVCluster            1.000000
m6_Revenue            0.678410
Frequency             0.403291
FrequencyCluster      0.379138
Segment_High-Value    0.373549
OverallScore          0.353104
Revenue               0.234242
RecencyCluster        0.227060
RevenueCluster        0.188523
Segment_Mid-Value     0.027161
User_ID              -0.050461
Recency              -0.243686
Segment_Low-Value    -0.249520
Name: LTVCluster, dtype: float64
# Imports needed for this step (the original repeated the same lines several
# times; one copy of each binds the same names).
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from xgboost import XGBRegressor

# Features: every column except the target (LTVCluster) and m6_Revenue.
# NOTE(review): X still includes User_ID, which is assigned from a running
# range earlier in this file rather than read from the data -- confirm it is
# meant to be a model feature.
X = tx_class.drop(['LTVCluster', 'm6_Revenue'], axis=1)
y = tx_class['LTVCluster']

# Hold out 5% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=56,
)

# Configure the classifier first, then fit it; fit() returns the estimator
# itself, so ltv_xgb_model ends up bound to the fitted model.
ltv_xgb_model = xgb.XGBClassifier(
    max_depth=5,
    learning_rate=0.1,
    objective='multi:softprob',
    n_jobs=-1,
)
ltv_xgb_model.fit(X_train, y_train, verbose=2)

# Report accuracy on the training rows, then on the held-out rows. The test
# columns are selected in the training column order before scoring.
train_accuracy = ltv_xgb_model.score(X_train, y_train)
print('Accuracy of XGB classifier on training set: {:.2f}'.format(train_accuracy))
test_accuracy = ltv_xgb_model.score(X_test[X_train.columns], y_test)
print('Accuracy of XGB classifier on test set: {:.2f}'.format(test_accuracy))

# Predicted LTV cluster for each held-out row.
y_pred = ltv_xgb_model.predict(X_test)
Accuracy of XGB classifier on training set: 0.94
Accuracy of XGB classifier on test set: 0.94
# Display the one-hot-encoded frame (tx_class) as the cell output.
tx_class
| | User_ID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | OverallScore | m6_Revenue | LTVCluster | Segment_High-Value | Segment_Low-Value | Segment_Mid-Value |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0 | 3 | 9 | 1 | 0.287363 | 0 | 4 | 0.452209 | 2 | 0 | 0 | 1 |
| 1 | 2.0 | 0 | 3 | 6 | 1 | 0.028388 | 0 | 4 | 0.142762 | 2 | 0 | 0 | 1 |
| 2 | 3.0 | 0 | 3 | 8 | 1 | 0.014870 | 0 | 4 | 0.004203 | 0 | 0 | 0 | 1 |
| 3 | 4.0 | 1 | 3 | 6 | 1 | 0.012853 | 0 | 4 | 0.002923 | 0 | 0 | 0 | 1 |
| 4 | 5.0 | 3 | 3 | 8 | 1 | 0.005576 | 0 | 4 | 0.000000 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5604127 | 5604128.0 | 13 | 2 | 15 | 2 | 36.428239 | 3 | 7 | 0.000000 | 0 | 1 | 0 | 0 |
| 5604128 | 5604129.0 | 13 | 2 | 12 | 2 | 28.100487 | 3 | 7 | 0.000000 | 0 | 1 | 0 | 0 |
| 5604129 | 5604130.0 | 12 | 2 | 16 | 2 | 27.944217 | 3 | 7 | 0.000000 | 0 | 1 | 0 | 0 |
| 5604130 | 5604131.0 | 7 | 2 | 18 | 2 | 24.543765 | 3 | 7 | 0.468071 | 2 | 1 | 0 | 0 |
| 5604131 | NaN | 11 | 2 | 13 | 2 | 17.675474 | 3 | 7 | 0.000000 | 0 | 1 | 0 | 0 |
5604132 rows × 13 columns
# Share of rows in each LTV cluster: non-null User_ID count per cluster
# divided by the overall non-null User_ID count.
users_per_cluster = tx_class.groupby('LTVCluster')['User_ID'].count()
users_per_cluster / tx_class['User_ID'].count()
LTVCluster
0    0.937227
1    0.015655
2    0.047117
Name: User_ID, dtype: float64
# Predict the held-out rows and print per-class precision/recall/F1.
y_pred = ltv_xgb_model.predict(X_test)
per_class_report = classification_report(y_test, y_pred)
print(per_class_report)
precision recall f1-score support
0 0.96 0.99 0.97 262637
1 0.48 0.23 0.31 4405
2 0.48 0.22 0.30 13165
accuracy 0.94 280207
macro avg 0.64 0.48 0.53 280207
weighted avg 0.93 0.94 0.93 280207
# Display the held-out feature rows produced by train_test_split.
X_test
| | User_ID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | OverallScore | Segment_High-Value | Segment_Low-Value | Segment_Mid-Value |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 254700 | 254701.0 | 0 | 3 | 7 | 1 | 0.458909 | 0 | 4 | 0 | 0 | 1 |
| 958295 | 958296.0 | 0 | 3 | 26 | 3 | 0.112509 | 0 | 6 | 1 | 0 | 0 |
| 4794539 | 4794540.0 | 27 | 0 | 1 | 0 | 0.000000 | 0 | 0 | 0 | 1 | 0 |
| 3469837 | 3469838.0 | 13 | 2 | 1 | 0 | 0.000000 | 0 | 2 | 0 | 1 | 0 |
| 4177086 | 4177087.0 | 7 | 2 | 1 | 0 | 0.042354 | 0 | 2 | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4825853 | 4825854.0 | 27 | 0 | 1 | 0 | 0.000720 | 0 | 0 | 0 | 1 | 0 |
| 3460000 | 3460001.0 | 13 | 2 | 1 | 0 | 0.000000 | 0 | 2 | 0 | 1 | 0 |
| 3893039 | 3893040.0 | 10 | 2 | 1 | 0 | 0.000000 | 0 | 2 | 0 | 1 | 0 |
| 2787908 | 2787909.0 | 17 | 1 | 2 | 0 | 0.014556 | 0 | 1 | 0 | 1 | 0 |
| 1115561 | 1115562.0 | 0 | 3 | 2 | 0 | 0.003920 | 0 | 3 | 0 | 0 | 1 |
280207 rows × 11 columns
# Display the held-out LTVCluster labels produced by train_test_split.
y_test
254700 2
958295 0
4794539 0
3469837 0
4177086 0
..
4825853 0
3460000 0
3893039 0
2787908 0
1115561 0
Name: LTVCluster, Length: 280207, dtype: int64
# Display the full feature frame (tx_class without LTVCluster and m6_Revenue).
X
| | User_ID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | OverallScore | Segment_High-Value | Segment_Low-Value | Segment_Mid-Value |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0 | 3 | 9 | 1 | 0.287363 | 0 | 4 | 0 | 0 | 1 |
| 1 | 2.0 | 0 | 3 | 6 | 1 | 0.028388 | 0 | 4 | 0 | 0 | 1 |
| 2 | 3.0 | 0 | 3 | 8 | 1 | 0.014870 | 0 | 4 | 0 | 0 | 1 |
| 3 | 4.0 | 1 | 3 | 6 | 1 | 0.012853 | 0 | 4 | 0 | 0 | 1 |
| 4 | 5.0 | 3 | 3 | 8 | 1 | 0.005576 | 0 | 4 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5604127 | 5604128.0 | 13 | 2 | 15 | 2 | 36.428239 | 3 | 7 | 1 | 0 | 0 |
| 5604128 | 5604129.0 | 13 | 2 | 12 | 2 | 28.100487 | 3 | 7 | 1 | 0 | 0 |
| 5604129 | 5604130.0 | 12 | 2 | 16 | 2 | 27.944217 | 3 | 7 | 1 | 0 | 0 |
| 5604130 | 5604131.0 | 7 | 2 | 18 | 2 | 24.543765 | 3 | 7 | 1 | 0 | 0 |
| 5604131 | NaN | 11 | 2 | 13 | 2 | 17.675474 | 3 | 7 | 1 | 0 | 0 |
5604132 rows × 11 columns